All packages used for analysis and visualization on this page:
# General data wrangling
library(tidyverse)
library(DT)
library(readxl)
# Visualization
library(plotly)
library(colorRamps)
library(RColorBrewer)
library(colorspace)
library(NeuralNetTools)
# Modeling
library(glmnet)
library(caret)
library(ranger)
# Ensemble building
library(caretEnsemble)
This page focuses on developing both individual models and model ensembles to predict annual change in CDR-Sum of Boxes based on rate of change in regional tau-PET accumulation, with age at baseline and sex included as covariates. This data was prepared in Data Preparation and is the non-PCA-transformed dataset. The same models are applied to PCA-transformed data in PCA Modeling to compare the results with the dimension-reduced and orthogonal principal components.
First, load in the data prepared in the previous data preparation phase:
# Load the objects saved by the Data Preparation page into the global
# environment -- expected to provide `annual.changes` (used below). NOTE(review):
# relative path assumes this page is knit from its own subdirectory; confirm.
load("../Prepared_Data.RData")
As per standard convention in model development, I will randomly partition this dataset into training and testing subsets, such that the models are trained and evaluated in independent data subsets. I will use the sample function with a specific seed set (127) to partition the data into 75% training and 25% testing.
# Set seed for consistency in random sampling for the 75/25 train/test split
set.seed(127)
# Sample 75% of the row indices without replacement for the training subset;
# use FALSE (not the reassignable shorthand F)
train.index <- sample(nrow(annual.changes), nrow(annual.changes)*0.75, replace=FALSE)
# Remove unnecessary identifier info (subject ID, interval number) from the
# dataset before modeling so they are not used as predictors
original <- annual.changes %>% ungroup() %>% select(-RID, -interval_num)
# Pre-processing (center/scale) will be applied in model training with caret
# Subset training + test data for original (ROI) data
original.train <- original[train.index, ]
original.test <- original[-train.index, ]
I will be using the caretEnsemble package to compile four individual regression models into a stacked ensemble. This package enables evaluation of the models individually as well as together in the ensemble.
The first step is to create a caretList of the four regression models I will use:
The four regression models are: glmnet (elastic net), knn (k-nearest neighbors), nnet (neural network), and ranger (random forest). I will use the trainControl function to specify ten-fold cross-validation with parallel processing.
# trainControl for 10-fold cross-validation with parallel processing enabled;
# spell out TRUE rather than the reassignable shorthand T
ensemble.control <- trainControl(method="cv", number=10, allowParallel=TRUE)
I’m using the RMSE as the metric according to which model parameters are tuned. All input data (which includes training and test data) will be automatically standardized using the preProcess center + scaling feature.
# Set seed for consistency
set.seed(127)
# Neural net -- linout=TRUE means linear output (i.e. not constrained to be [0,1]);
# try a range of hidden layer sizes and weight decays (TRUE/FALSE spelled out
# instead of the reassignable shorthands T/F)
my.neuralnet <- caretModelSpec(method="nnet", linout=TRUE, trace=FALSE,
                               tuneGrid = expand.grid(size=c(1, 3, 5, 10, 15),
                                                      decay = seq(0, 1, by=0.2)))
# k-nearest neighbors: try 20 different values of k
my.knn <- caretModelSpec(method="knn", tuneLength=20)
# Random forest: try 15 different numbers of features considered at each node,
# 500 sampled trees, permutation-based variable importance
my.randomforest <- caretModelSpec(method="ranger", tuneLength=15, num.trees=500,
                                  importance="permutation")
# Elastic net: four alpha values for ridge/lasso blending and four lambda
# values for the coefficient penalty
my.elasticnet <- caretModelSpec(method="glmnet",
                                tuneGrid=expand.grid(alpha=c(0, 0.1, 0.6, 1),
                                                     lambda=c(5^-5, 5^-3, 5^-1, 1)))
# Compile individual models into one cohesive model list using caretList;
# capture.output + invisible suppress the verbose training logs
invisible(capture.output(ensemble.models <- caretList(CDRSB ~ .,
                                                      data=original.train,
                                                      trControl=ensemble.control,
                                                      metric="RMSE",
                                                      preProcess=c("center", "scale"),
                                                      tuneList=list(my.neuralnet, my.knn, my.randomforest, my.elasticnet))))
The final chosen parameters for each model can be viewed:
Elastic net
ensemble.models$glmnet
# Elastic net cross-validation plot: RMSE and R-squared over the alpha/lambda
# tuning grid, one colored line per alpha value, faceted by metric
enet.results <- ensemble.models$glmnet$results
p.enet.cv <- data.frame(alpha = as.character(enet.results$alpha),
                        RMSE = enet.results$RMSE,
                        R2 = enet.results$Rsquared,
                        lambda = enet.results$lambda) %>%
  pivot_longer(cols = c(RMSE, R2), names_to = "Metric", values_to = "Value") %>%
  mutate(Metric = ifelse(Metric == "R2", "Model R-Squared", "Model RMSE")) %>%
  ggplot(data = ., mapping = aes(x = lambda, y = Value, color = alpha)) +
  facet_wrap(Metric ~ ., scales = "free") +
  geom_point() +
  geom_line(aes(group = alpha)) +
  theme_minimal() +
  ggtitle("Elastic Net Regression Cross-Validated Results") +
  theme(plot.title = element_text(hjust = 0.5),
        axis.title = element_blank(),
        panel.spacing = unit(2, "lines"))
# Render interactively; label the axes of both facets
ggplotly(p.enet.cv) %>%
  layout(yaxis = list(title = "R2", titlefont = list(size = 12)),
         xaxis = list(title = "lambda", titlefont = list(size = 12)),
         yaxis2 = list(title = "RMSE", titlefont = list(size = 12)),
         xaxis2 = list(title = "lambda", titlefont = list(size = 12)))
# Clean up chunk-local objects (train.index is no longer needed after the
# earlier train/test subsetting)
rm(enet.results, p.enet.cv, train.index)
## glmnet
##
## 246 samples
## 33 predictor
##
## Pre-processing: centered (33), scaled (33)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 221, 222, 221, 222, 222, 221, ...
## Resampling results across tuning parameters:
##
## alpha lambda RMSE Rsquared MAE
## 0.0 0.00032 1.0024825 0.2653441 0.6913281
## 0.0 0.00800 1.0024825 0.2653441 0.6913281
## 0.0 0.20000 0.9898643 0.2445495 0.6370114
## 0.0 1.00000 1.0343083 0.1746313 0.6488788
## 0.1 0.00032 1.0272172 0.2559091 0.7255175
## 0.1 0.00800 1.0173454 0.2591756 0.7119920
## 0.1 0.20000 1.0054750 0.2281684 0.6327570
## 0.1 1.00000 1.0779697 0.1152146 0.6739505
## 0.6 0.00032 1.0276861 0.2555838 0.7261108
## 0.6 0.00800 1.0089328 0.2615413 0.6961165
## 0.6 0.20000 1.0846309 0.1464500 0.6735706
## 0.6 1.00000 1.0728707 NaN 0.6855987
## 1.0 0.00032 1.0279578 0.2554543 0.7261422
## 1.0 0.00800 1.0031054 0.2639103 0.6829836
## 1.0 0.20000 1.0795456 0.0510474 0.6794389
## 1.0 1.00000 1.0728707 NaN 0.6855987
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0 and lambda = 0.2.
kNN
# Print the caret summary for the kNN model
ensemble.models$knn
# kNN cross-validation plot: RMSE and R-squared across the tested values of k
knn.k <- ensemble.models$knn$results$k
knn.rmse <- ensemble.models$knn$results$RMSE
knn.r2 <- ensemble.models$knn$results$Rsquared
p.knn.cv <- data.frame(k=knn.k, RMSE=knn.rmse, R2=knn.r2) %>%
  pivot_longer(cols=c(RMSE, R2), names_to="Metric", values_to="Value") %>%
  mutate(Metric = ifelse(Metric=="R2", "Model R-Squared", "Model RMSE")) %>%
  ggplot(data=., mapping=aes(x=k, y=Value, color=Metric)) +
  facet_wrap(Metric ~ ., scales="free") +
  geom_point() +
  geom_line() +
  theme_minimal() +
  ggtitle("kNN Regression Cross-Validated Results") +
  theme(plot.title=element_text(hjust=0.5),
        axis.title=element_blank(),
        panel.spacing = unit(2, "lines"))
ggplotly(p.knn.cv) %>%
  layout(yaxis = list(title = "R2",
                      titlefont = list(size = 12)),
         xaxis = list(title = "k",
                      titlefont = list(size = 12)),
         yaxis2 = list(title = "RMSE",
                       titlefont = list(size = 12)),
         xaxis2 = list(title = "k",
                       titlefont = list(size = 12)))
# Remove only the objects created in this chunk. (The original rm() call also
# listed knn.alpha, knn.lambda, and train.index, none of which exist at this
# point -- train.index was already removed in the elastic net chunk -- which
# triggered "object not found" warnings.)
rm(knn.k, knn.rmse, knn.r2, p.knn.cv)
## k-Nearest Neighbors
##
## 246 samples
## 33 predictor
##
## Pre-processing: centered (33), scaled (33)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 221, 222, 221, 222, 222, 221, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 1.116258 0.05554699 0.6276505
## 7 1.091992 0.07131052 0.6108052
## 9 1.100593 0.07790373 0.6130387
## 11 1.095345 0.06069154 0.6068137
## 13 1.094842 0.05226503 0.6059138
## 15 1.090812 0.04770270 0.5984284
## 17 1.083195 0.03942269 0.5999945
## 19 1.074670 0.04885680 0.5935437
## 21 1.074273 0.05665069 0.5929702
## 23 1.077304 0.05419062 0.5951436
## 25 1.078761 0.04598779 0.5941601
## 27 1.077614 0.05117091 0.5940578
## 29 1.075366 0.05022101 0.5902141
## 31 1.077190 0.03886373 0.5887206
## 33 1.078476 0.04163973 0.5886809
## 35 1.078656 0.04222438 0.5902351
## 37 1.078481 0.04605131 0.5884794
## 39 1.078717 0.04757272 0.5901374
## 41 1.076745 0.04494799 0.5894965
## 43 1.077263 0.04513458 0.5898005
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 21.
Neural Network
# Print the caret summary for the neural network model
ensemble.models$nnet
# Neural network cross-validation plot: RMSE and R-squared by hidden-layer
# size, one colored line per weight-decay value, faceted by metric
nnet.results <- ensemble.models$nnet$results
p.nnet.cv <- data.frame(n.neurons = nnet.results$size,
                        RMSE = nnet.results$RMSE,
                        R2 = nnet.results$Rsquared,
                        decay = as.character(nnet.results$decay)) %>%
  pivot_longer(cols = c(RMSE, R2), names_to = "Metric", values_to = "Value") %>%
  mutate(Metric = ifelse(Metric == "R2", "Model R-Squared", "Model RMSE")) %>%
  ggplot(data = ., mapping = aes(x = n.neurons, y = Value, color = decay)) +
  facet_wrap(Metric ~ ., scales = "free") +
  geom_point() +
  geom_line(aes(group = decay)) +
  theme_minimal() +
  ggtitle("Neural Network Regression Cross-Validated Results") +
  theme(plot.title = element_text(hjust = 0.5),
        axis.title = element_blank(),
        panel.spacing = unit(2, "lines"))
# Render interactively; label the axes of both facets
ggplotly(p.nnet.cv) %>%
  layout(yaxis = list(title = "R2", titlefont = list(size = 12)),
         xaxis = list(title = "# Neurons in Hidden Layer", titlefont = list(size = 12)),
         yaxis2 = list(title = "RMSE", titlefont = list(size = 12)),
         xaxis2 = list(title = "# Neurons in Hidden Layer", titlefont = list(size = 12)))
# Clean up chunk-local objects
rm(nnet.results, p.nnet.cv)
## Neural Network
##
## 246 samples
## 33 predictor
##
## Pre-processing: centered (33), scaled (33)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 221, 222, 221, 222, 222, 221, ...
## Resampling results across tuning parameters:
##
## size decay RMSE Rsquared MAE
## 1 0.0 1.328625 0.13757164 0.7145074
## 1 0.2 1.264576 0.21623652 0.6936230
## 1 0.4 1.217789 0.30336524 0.6717363
## 1 0.6 1.191167 0.27985600 0.6736006
## 1 0.8 1.137362 0.30955177 0.6643404
## 1 1.0 1.098219 0.34422851 0.6551192
## 3 0.0 1.741657 0.20942663 1.0281973
## 3 0.2 1.323120 0.14957709 0.9070708
## 3 0.4 1.270399 0.10796698 0.8759850
## 3 0.6 1.154506 0.18820124 0.7570883
## 3 0.8 1.248311 0.18326235 0.8263540
## 3 1.0 1.080586 0.26200460 0.7156278
## 5 0.0 1.687248 0.12168509 1.1217697
## 5 0.2 1.463661 0.08668264 0.9990230
## 5 0.4 1.361751 0.12156121 0.9220889
## 5 0.6 1.211507 0.15780435 0.8068860
## 5 0.8 1.206544 0.13201464 0.8103969
## 5 1.0 1.237265 0.19792066 0.8266016
## 10 0.0 1.795388 0.02723955 1.3591350
## 10 0.2 1.320734 0.13099843 0.9296914
## 10 0.4 1.321488 0.11481149 0.9128680
## 10 0.6 1.282147 0.08057191 0.8739253
## 10 0.8 1.243587 0.11041300 0.8242700
## 10 1.0 1.231847 0.07735076 0.8450555
## 15 0.0 1.641867 0.04385296 1.2404813
## 15 0.2 1.428081 0.08025674 0.9245121
## 15 0.4 1.327971 0.06838231 0.8721033
## 15 0.6 1.250592 0.12015134 0.8584150
## 15 0.8 1.213121 0.13089912 0.8159638
## 15 1.0 1.165197 0.16543419 0.7901996
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 3 and decay = 1.
Here’s a graphical representation of the caret-trained neural network:
# Zero out all four plot margins (numeric(4) == c(0, 0, 0, 0)) so the network
# diagram fills the plotting device. NOTE(review): par() is not restored
# afterwards, so later base-graphics plots in this session inherit the margins.
par(mar = numeric(4))
# Draw the trained nnet: red connections for positive weights, blue for
# negative, gray nodes/borders, semi-transparent lines (alpha_val=0.4)
plotnet(ensemble.models$nnet, cex_val=0.8, pad_x=0.6, pos_col="firebrick3", neg_col="dodgerblue4",
circle_col="lightslategray", bord_col="lightslategray", alpha_val=0.4)
Random Forest
# Print the caret summary for the random forest model
ensemble.models$ranger
# Random forest cross-validation plot: RMSE and R-squared by the number of
# predictors considered at each split (mtry), one colored line per split rule
rf.results <- ensemble.models$ranger$results
p.rf.cv <- data.frame(splitrule = rf.results$splitrule,
                      RMSE = rf.results$RMSE,
                      R2 = rf.results$Rsquared,
                      numpred = rf.results$mtry) %>%
  pivot_longer(cols = c(RMSE, R2), names_to = "Metric", values_to = "Value") %>%
  mutate(Metric = ifelse(Metric == "R2", "Model R-Squared", "Model RMSE")) %>%
  ggplot(data = ., mapping = aes(x = numpred, y = Value, color = splitrule)) +
  facet_wrap(Metric ~ ., scales = "free") +
  geom_point() +
  geom_line(aes(group = splitrule)) +
  theme_minimal() +
  ggtitle("Random Forest Regression Cross-Validated Results") +
  theme(plot.title = element_text(hjust = 0.5),
        axis.title = element_blank(),
        panel.spacing = unit(2, "lines"))
# Render interactively; label the axes of both facets
ggplotly(p.rf.cv) %>%
  layout(yaxis = list(title = "R2", titlefont = list(size = 12)),
         xaxis = list(title = "# Predictors in Decision Node", titlefont = list(size = 12)),
         yaxis2 = list(title = "RMSE", titlefont = list(size = 12)),
         xaxis2 = list(title = "# Predictors in Decision Node", titlefont = list(size = 12)))
# Clean up chunk-local objects
rm(rf.results, p.rf.cv)
## Random Forest
##
## 246 samples
## 33 predictor
##
## Pre-processing: centered (33), scaled (33)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 221, 222, 221, 222, 222, 221, ...
## Resampling results across tuning parameters:
##
## mtry splitrule RMSE Rsquared MAE
## 2 variance 1.064625 0.12159510 0.6450826
## 2 extratrees 1.048859 0.09012903 0.6413026
## 4 variance 1.084291 0.12387114 0.6533298
## 4 extratrees 1.055776 0.08186302 0.6453438
## 6 variance 1.099507 0.11633353 0.6585372
## 6 extratrees 1.051749 0.10265665 0.6381747
## 8 variance 1.109254 0.12427881 0.6619890
## 8 extratrees 1.062632 0.09373095 0.6534412
## 10 variance 1.115296 0.12228989 0.6648207
## 10 extratrees 1.065214 0.10195274 0.6497414
## 13 variance 1.115753 0.11651169 0.6595782
## 13 extratrees 1.061037 0.10938914 0.6461101
## 15 variance 1.127241 0.12307379 0.6670222
## 15 extratrees 1.063785 0.10712628 0.6501017
## 17 variance 1.138345 0.12177977 0.6716269
## 17 extratrees 1.068892 0.09720528 0.6493183
## 19 variance 1.134277 0.11795628 0.6700897
## 19 extratrees 1.075894 0.10123054 0.6558778
## 21 variance 1.142939 0.12049751 0.6717537
## 21 extratrees 1.071633 0.10974243 0.6459720
## 24 variance 1.150516 0.12156153 0.6760727
## 24 extratrees 1.073411 0.11032160 0.6513125
## 26 variance 1.146694 0.13039576 0.6744959
## 26 extratrees 1.073445 0.10615220 0.6524639
## 28 variance 1.151409 0.11795867 0.6719930
## 28 extratrees 1.077274 0.11107250 0.6552030
## 30 variance 1.151527 0.12254689 0.6722165
## 30 extratrees 1.088744 0.11102570 0.6542741
## 33 variance 1.155188 0.12900397 0.6747987
## 33 extratrees 1.085570 0.10789043 0.6583809
##
## Tuning parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 2, splitrule = extratrees and min.node.size = 5.
These four models can be resampled and aggregated using the resamples function:
# Collect the per-fold resampling results from all four trained models and
# report summary metrics for MAE, RMSE, and R2
set.seed(127)
# resamples() pairs up the 10 CV folds across models (same folds were used
# for each model, so the comparison is paired)
ensemble.results <- resamples(ensemble.models)
summary(ensemble.results)
##
## Call:
## summary.resamples(object = ensemble.results)
##
## Models: nnet, knn, ranger, glmnet
## Number of resamples: 10
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## nnet 0.5034794 0.5548646 0.7226288 0.7156278 0.8724837 0.8959477 0
## knn 0.3477478 0.4566578 0.6387360 0.5929702 0.6904719 0.9059611 0
## ranger 0.4051231 0.5121434 0.6982644 0.6413026 0.7523551 0.9095211 0
## glmnet 0.3758580 0.4969080 0.6371010 0.6370114 0.7929573 0.8404093 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## nnet 0.6660448 0.7485084 1.0835559 1.0805864 1.247121 1.640732 0
## knn 0.5047511 0.6937286 1.1151631 1.0742732 1.370107 1.912549 0
## ranger 0.5232129 0.6800173 1.0803837 1.0488592 1.330690 1.821540 0
## glmnet 0.4601160 0.6378514 0.9067867 0.9898643 1.336987 1.735698 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## nnet 0.0400650563 0.131463965 0.33016028 0.26200460 0.34807695 0.4577081 0
## knn 0.0007697474 0.006319066 0.02821930 0.05665069 0.05807266 0.2418623 0
## ranger 0.0009920488 0.016495845 0.06184173 0.09012903 0.15514295 0.2557172 0
## glmnet 0.0277449563 0.103045558 0.19118049 0.24454950 0.30601572 0.6654200 0
It’s also useful to look at the regression correlation between these component models:
# Pairwise correlation of the models' per-fold resampling results; high
# correlation suggests the models make similar errors and the ensemble may
# gain less from combining them
cat("\nRMSE Correlation\n")
modelCor(ensemble.results, metric="RMSE")
cat("\n\nR2 Correlation\n")
modelCor(ensemble.results, metric="Rsquared")
##
## RMSE Correlation
## nnet knn ranger glmnet
## nnet 1.0000000 0.7691416 0.8088357 0.8862915
## knn 0.7691416 1.0000000 0.9941858 0.9496195
## ranger 0.8088357 0.9941858 1.0000000 0.9655972
## glmnet 0.8862915 0.9496195 0.9655972 1.0000000
##
##
## R2 Correlation
## nnet knn ranger glmnet
## nnet 1.0000000 0.2405107 0.6692003 0.5441721
## knn 0.2405107 1.0000000 0.5653012 0.3877298
## ranger 0.6692003 0.5653012 1.0000000 0.6270817
## glmnet 0.5441721 0.3877298 0.6270817 1.0000000
Finally, before moving on to stacked ensemble predictions, I’ll visualize this relative performance across the four models using the dotplot function:
# Packages for arranging lattice-based dotplots side by side as ggplot objects
library(grid)
library(gridExtra)
library(ggplotify)
# Dotplots of cross-validated RMSE and R-squared (with confidence intervals)
# for the four component models, converted to ggplot for grid.arrange
p.rmse <- as.ggplot(dotplot(ensemble.results, metric="RMSE"))
p.r2 <- as.ggplot(dotplot(ensemble.results, metric="Rsquared"))
grid.arrange(p.rmse, p.r2, ncol=2)
Ideally, this ensemble would be composed of models that show minimal correlation with each other and that offer larger R-squared values than can be seen here. Despite the high RMSE correlation and relatively weak performance across these four models, I’ll move forward to see if ensembling strengthens the overall predictions for the change in CDR-Sum of Boxes over time.
Starting with the basic caretEnsemble function, which employs a generalized linear model to combine the component models:
set.seed(127)
# Control object for the meta-learner; note this overwrites the earlier
# ensemble.control ("repeatedcv" with the default of 1 repeat behaves like
# plain 10-fold CV). TRUE spelled out instead of the reassignable shorthand T.
ensemble.control <- trainControl(method="repeatedcv", number=10, allowParallel = TRUE)
# Stack the four base models with a GLM meta-learner and report the weights
stacked.ensemble.glm <- caretEnsemble(ensemble.models, metric="RMSE", trControl=ensemble.control)
summary(stacked.ensemble.glm)
## The following models were ensembled: nnet, knn, ranger, glmnet
## They were weighted:
## -0.0287 0.2595 -0.271 0.4013 0.5987
## The resulting RMSE is: 1.0158
## The fit for each individual model on the RMSE is:
## method RMSE RMSESD
## nnet 1.0805864 0.3570005
## knn 1.0742732 0.4443390
## ranger 1.0488592 0.4220274
## glmnet 0.9898643 0.4280575
Here’s a visual of this ensemble model:
plot(stacked.ensemble.glm)
Stacked ensembles can also be constructed using caretStack, which applies user-defined linear combinations of each constituent model. I’ll try one using a random forest combination of models and one using an elastic net combination of models.
set.seed(127)
# Stack the four base models with a random forest meta-learner...
stacked.ensemble.rf <- caretStack(ensemble.models, method = "rf", metric = "RMSE", trControl = ensemble.control)
# ...and with an elastic net meta-learner. NOTE(review): the seed is only set
# before the first fit, so the glmnet stack's folds differ from the rf stack's.
stacked.ensemble.glmnet <- caretStack(ensemble.models, method="glmnet", metric="RMSE", trControl = ensemble.control)
# Print the GLM-stacked ensemble (base models and cross-validated fit)
cat("\nStacked ensemble, generalized linear model:\n")
##
## Stacked ensemble, generalized linear model:
stacked.ensemble.glm
## A glm ensemble of 4 base models: nnet, knn, ranger, glmnet
##
## Ensemble results:
## Generalized Linear Model
##
## 246 samples
## 4 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times)
## Summary of sample sizes: 222, 221, 222, 222, 221, 220, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 1.015842 0.1806129 0.6286504
# Print the random-forest-stacked ensemble (base models and cross-validated fit)
cat("\n\nStacked ensemble, random forest:\n")
##
##
## Stacked ensemble, random forest:
stacked.ensemble.rf
## A rf ensemble of 4 base models: nnet, knn, ranger, glmnet
##
## Ensemble results:
## Random Forest
##
## 246 samples
## 4 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times)
## Summary of sample sizes: 222, 221, 222, 222, 221, 220, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 1.088156 0.1663274 0.6343777
## 3 1.090313 0.1593837 0.6389241
## 4 1.105520 0.1455903 0.6431148
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 2.
# Print the elastic-net-stacked ensemble (base models and cross-validated fit)
cat("\n\nStacked ensemble, elastic net:\n")
##
##
## Stacked ensemble, elastic net:
stacked.ensemble.glmnet
## A glmnet ensemble of 4 base models: nnet, knn, ranger, glmnet
##
## Ensemble results:
## glmnet
##
## 246 samples
## 4 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times)
## Summary of sample sizes: 222, 222, 222, 221, 222, 221, ...
## Resampling results across tuning parameters:
##
## alpha lambda RMSE Rsquared MAE
## 0.10 0.0008093379 1.032366 0.1937303 0.6374587
## 0.10 0.0080933788 1.032313 0.1938340 0.6373587
## 0.10 0.0809337883 1.029287 0.1961329 0.6336675
## 0.55 0.0008093379 1.032572 0.1935667 0.6376566
## 0.55 0.0080933788 1.032505 0.1934793 0.6373892
## 0.55 0.0809337883 1.028034 0.1961576 0.6308639
## 1.00 0.0008093379 1.032600 0.1935506 0.6376821
## 1.00 0.0080933788 1.032372 0.1933438 0.6373242
## 1.00 0.0809337883 1.028801 0.1950727 0.6297854
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0.55 and lambda = 0.08093379.
Each of these three models shows very comparable R-squared and MAE values. The RMSE is slightly lower for the glmnet-combined stacked ensemble model, though this difference may not be significant and may not translate to the out-of-sample data.
These three ensemble models will be used to predict annual change in CDR-Sum of Boxes in the same test dataset. Note: since the models were created with center and scale preprocessing specified, the test data does not need to be manually pre-processed.
Model predictions using training data:
# In-sample predictions from each base model; predict.train with no newdata
# returns fitted values on the training set. The ranger line previously used
# bare predict() -- switched to predict.train for consistency (same dispatch).
enet.train <- predict.train(ensemble.models$glmnet)
knn.train <- predict.train(ensemble.models$knn)
nnet.train <- predict.train(ensemble.models$nnet)
rf.train <- predict.train(ensemble.models$ranger)
# In-sample predictions from the three stacked ensembles
ensemble.glm.train <- predict(stacked.ensemble.glm)
ensemble.glmnet.train <- predict(stacked.ensemble.glmnet)
ensemble.rf.train <- predict(stacked.ensemble.rf)
# Observed annual change in CDR-SB for the training subset
real.train <- original.train$CDRSB
# Assemble predictions and observed values into one data frame for display
train.df <- do.call(cbind, list(elastic.net=enet.train, knn=knn.train, neural.net=nnet.train, random.forest=rf.train,
                                ensemble.glm=ensemble.glm.train, ensemble.glmnet=ensemble.glmnet.train,
                                ensemble.rf=ensemble.rf.train, real.CDR=real.train)) %>% as.data.frame()
datatable(train.df %>% mutate_if(is.numeric, function(x) round(x,4)) %>% select(real.CDR, elastic.net:ensemble.rf))
Model predictions using test data:
# Out-of-sample predictions from each base model on the held-out test subset
enet.test <- predict.train(ensemble.models$glmnet, newdata=original.test)
knn.test <- predict.train(ensemble.models$knn, newdata=original.test)
nnet.test <- predict.train(ensemble.models$nnet, newdata=original.test)
rf.test <- predict.train(ensemble.models$ranger, newdata=original.test)
# Out-of-sample predictions from the three stacked ensembles
ensemble.glm.test <- predict(stacked.ensemble.glm, newdata=original.test)
ensemble.glmnet.test <- predict(stacked.ensemble.glmnet, newdata=original.test)
ensemble.rf.test <- predict(stacked.ensemble.rf, newdata=original.test)
# Observed annual change in CDR-SB for the test subset
real.test <- original.test$CDRSB
# Column-bind predictions and observed values, then coerce to a data frame
test.df <- cbind(elastic.net = enet.test,
                 knn = knn.test,
                 neural.net = nnet.test,
                 random.forest = rf.test,
                 ensemble.glm = ensemble.glm.test,
                 ensemble.glmnet = ensemble.glmnet.test,
                 ensemble.rf = ensemble.rf.test,
                 real.CDR = real.test) %>% as.data.frame()
datatable(test.df %>% mutate_if(is.numeric, function(x) round(x,4)))
These predictions can be compared with scatterplot visualizations:
# Scatterplots of predicted vs. observed CDR-SB annual change, one facet per
# model, for the training subset...
p.train <- train.df %>%
  pivot_longer(cols = c(-real.CDR), names_to = "Model", values_to = "Prediction") %>%
  mutate(Model = factor(Model, levels = c("ensemble.glmnet", "ensemble.glm", "ensemble.rf",
                                          "elastic.net", "knn", "neural.net", "random.forest"))) %>%
  ggplot(data = ., mapping = aes(x = real.CDR, y = Prediction, color = Model)) +
  geom_point(alpha = 0.3) +
  facet_grid(. ~ Model, scales = "free") +
  labs(title = "Model Predictions for CDR Sum of Boxes Annual Change in Training Data",
       x = "Actual CDR-SoB Change",
       y = "Predicted CDR-SoB Change") +
  theme_minimal() +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5))
# ...and for the held-out test subset
p.test <- test.df %>%
  pivot_longer(cols = c(-real.CDR), names_to = "Model", values_to = "Prediction") %>%
  mutate(Model = factor(Model, levels = c("ensemble.glmnet", "ensemble.glm", "ensemble.rf",
                                          "elastic.net", "knn", "neural.net", "random.forest"))) %>%
  ggplot(data = ., mapping = aes(x = real.CDR, y = Prediction, color = Model)) +
  geom_point(alpha = 0.3) +
  facet_grid(. ~ Model, scales = "free") +
  labs(title = "Model Predictions for CDR Sum of Boxes Annual Change in Test Data",
       x = "Actual CDR-SoB Change",
       y = "Predicted CDR-SoB Change") +
  theme_minimal() +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5))
# Render both interactively
ggplotly(p.train)
ggplotly(p.test)
Calculate the \(R^2\) and RMSE for the training & testing data across the models:
# RMSE on the training data for every model. BUG FIX: the elastic.net column
# previously reused the kNN predictions (RMSE(knn.train, ...)) in all four
# tables below -- corrected to use the elastic net predictions (enet.*).
rmse.train <- data.frame(ensemble.glmnet=RMSE(ensemble.glmnet.train, real.train),
                         ensemble.glm=RMSE(ensemble.glm.train, real.train),
                         ensemble.rf=RMSE(ensemble.rf.train, real.train),
                         elastic.net=RMSE(enet.train, real.train),
                         knn=RMSE(knn.train, real.train),
                         neural.net=RMSE(nnet.train, real.train),
                         random.forest=RMSE(rf.train, real.train),
                         Metric="Train_RMSE")
# RMSE on the held-out test data
rmse.test <- data.frame(ensemble.glmnet=RMSE(ensemble.glmnet.test, real.test),
                        ensemble.glm=RMSE(ensemble.glm.test, real.test),
                        ensemble.rf=RMSE(ensemble.rf.test, real.test),
                        elastic.net=RMSE(enet.test, real.test),
                        knn=RMSE(knn.test, real.test),
                        neural.net=RMSE(nnet.test, real.test),
                        random.forest=RMSE(rf.test, real.test),
                        Metric="Test_RMSE")
# R-squared on the training data
r2.train <- data.frame(ensemble.glmnet=R2(ensemble.glmnet.train, real.train),
                       ensemble.glm=R2(ensemble.glm.train, real.train),
                       ensemble.rf=R2(ensemble.rf.train, real.train),
                       elastic.net=R2(enet.train, real.train),
                       knn=R2(knn.train, real.train),
                       neural.net=R2(nnet.train, real.train),
                       random.forest=R2(rf.train, real.train),
                       Metric="Train_R2")
# R-squared on the held-out test data
r2.test <- data.frame(ensemble.glmnet=R2(ensemble.glmnet.test, real.test),
                      ensemble.glm=R2(ensemble.glm.test, real.test),
                      ensemble.rf=R2(ensemble.rf.test, real.test),
                      elastic.net=R2(enet.test, real.test),
                      knn=R2(knn.test, real.test),
                      neural.net=R2(nnet.test, real.test),
                      random.forest=R2(rf.test, real.test),
                      Metric="Test_R2")
# Display a model-by-metric table (rbind.fill stacks the four one-row frames)
do.call(plyr::rbind.fill, list(rmse.train, rmse.test, r2.train, r2.test)) %>%
  pivot_longer(cols=c(-Metric), names_to="Model", values_to="Value") %>%
  mutate(Metric = str_replace(Metric, "_", " ")) %>%
  pivot_wider(id_cols="Model", names_from="Metric", values_from="Value") %>%
  mutate_if(is.numeric, function(x) round(x,4)) %>%
  datatable()
# Long-format table of all train/test metrics: split "Train_RMSE" etc. into
# separate Data and Metric columns for plotting
overall.ensemble.results <- do.call(plyr::rbind.fill, list(rmse.train, rmse.test, r2.train, r2.test)) %>%
  pivot_longer(cols=c(-Metric), names_to="Model", values_to="Value") %>%
  separate(Metric, into=c("Data", "Metric"), sep="_")
# Line plot comparing train vs. test performance per model, faceted by metric
p.ensemble.r2.rmse <- overall.ensemble.results %>%
  mutate(Metric = ifelse(Metric=="RMSE", "Model Root Mean Square Error", "Model R-Squared")) %>%
  mutate(Data=factor(Data, levels=c("Train", "Test")),
         Model=factor(Model, levels=c("ensemble.glmnet", "ensemble.glm", "ensemble.rf",
                                      "elastic.net", "knn", "neural.net", "random.forest"))) %>%
  ggplot(data=., mapping=aes(x=Data, y=Value, color=Model, group=Model)) +
  geom_point() +
  geom_line() +
  theme_minimal() +
  facet_wrap(Metric~., scales="free", nrow=1) +
  theme(strip.text=element_text(size=12, face="bold"),
        axis.title=element_blank())
# Render interactively with fixed dimensions; FALSE spelled out instead of
# the reassignable shorthand F
ggplotly(p.ensemble.r2.rmse) %>%
  layout(yaxis = list(title = "R2", titlefont = list(size = 12)),
         xaxis = list(title = "Data Subset", titlefont = list(size = 12)),
         yaxis2 = list(title = "RMSE", titlefont = list(size = 12)),
         xaxis2 = list(title = "Data Subset", titlefont = list(size = 12)),
         autosize = FALSE, width = 1000, height = 400)